MaxPoolFusion

对输入张量执行最大池化操作，并将池化结果裁剪到 \([\text{minf},\ \text{maxf}]\) 区间。

\[\text{output}_{b, h_o, w_o, c} = \operatorname{clip}\Bigg( \max_{h_i, w_i \in \mathcal{W}(h_o, w_o)} \Big( \text{input}_{b,\; h_i,\; w_i,\; c} \Big),\; \text{minf},\; \text{maxf} \Bigg)\]

其中，窗口区域 \(\mathcal{W}(h_o, w_o)\) 的定义如下：

\[ \begin{align}\begin{aligned}h_i = h_o \cdot \text{stride}_h - \text{pad}_u + \Delta h\\w_i = w_o \cdot \text{stride}_w - \text{pad}_l + \Delta w\\\Delta h \in [0,\ \text{win}_h - 1],\quad \Delta w \in [0,\ \text{win}_w - 1]\end{aligned}\end{align} \]

有效窗口点满足：

\[0 \le h_i < \text{in}_h,\qquad 0 \le w_i < \text{in}_w\]

原始窗口起点定义为：

\[h_{\text{start}} = h_o \cdot \text{stride}_h - \text{pad}_u\]

\[w_{\text{start}} = w_o \cdot \text{stride}_w - \text{pad}_l\]

合法采样范围为：

\[\Delta h \in \Big[ \max(0,\ -h_{\text{start}}),\; \min(\text{win}_h,\ \text{in}_h - h_{\text{start}}) \Big)\]

\[\Delta w \in \Big[ \max(0,\ -w_{\text{start}}),\; \min(\text{win}_w,\ \text{in}_w - w_{\text{start}}) \Big)\]

最大池化计算：

\[v_{\max} = \max_{\Delta h,\ \Delta w}\; \text{input}_{b,\; h_{\text{start}} + \Delta h,\; w_{\text{start}} + \Delta w,\; c}\]

最终输出：

\[\text{output}_{b, h_o, w_o, c} = \min\big(\max(v_{\max},\ \text{minf}),\ \text{maxf}\big)\]

输入：

input - 输入张量指针，采用 NHWC 格式，形状为 \([batch,\ in\_h,\ in\_w,\ channel]\)

params - 参数数组，包含以下元素：

in_w - 输入张量的宽度 (W)

in_h - 输入张量的高度 (H)

win_w - 池化窗口的宽度，即窗口在 W 方向的大小

win_h - 池化窗口的高度，即窗口在 H 方向的大小

output_w - 输出特征图的宽度

output_h - 输出特征图的高度

batch - 批次大小，即输入中的 batch 数

channel - 通道数 C ，每个池化位置都分别对 C 个通道独立执行最大池化与裁剪

stride_w - 池化窗口在 W 方向的步长

stride_h - 池化窗口在 H 方向的步长

pad_l - 输入特征图左侧的填充大小

pad_u - 输入特征图上侧的填充大小

minf - 输出结果的下界值；params 中存放的是指向该值的指针（转换为 long long）

maxf - 输出结果的上界值；params 中存放的是指向该值的指针（转换为 long long）

core_mask - 核心掩码，指定使用的计算核心（仅共享存储版本使用，作为独立参数传入，不包含在 params 数组中）

输出：

output - 输出张量指针，采用 NHWC 格式，形状为 \([batch,\ output\_h,\ output\_w,\ channel]\)。

支持平台：
FT78NE MT7004

备注：

FT78NE 支持fp32, fp64

MT7004 支持fp16, fp32

调用时 input、output 作为独立的指针参数传入（共享存储版本另需传入 core_mask）；其余参数打包进 long long params 数组，顺序为： in_w, in_h, win_w, win_h, output_w, output_h, batch, channel, stride_w, stride_h, pad_l, pad_u, minf, maxf（共 14 项，其中 minf、maxf 存放的是指向对应数值的指针）

共享存储版本:

void hp_maxpool_fusion_s(half *input, half *output, long long *params, int core_mask)

void fp_maxpool_fusion_s(float *input, float *output, long long *params, int core_mask)

void dp_maxpool_fusion_s(double *input, double *output, long long *params, int core_mask)

C调用示例：

//FT78NE示例
#include <stdio.h>

/*
 * Example: call the shared-storage fp32 max-pool fusion routine on a
 * batch=4, 32x32, channel=2 input with a 6x6 window, stride 4 and no
 * padding, clamping every result to [0, 50].
 *
 * Returns 0 unconditionally; the routine's result is written to output_ptr.
 */
int main(int argc, char *argv[]) {
    /* Fixed buffer addresses; presumably regions of the target's memory
     * map -- adjust them for the actual board. */
    float *input_ptr = (float *)0xA0000000;
    float *output_ptr = (float *)0xB0000000;

    int in_w = 32;
    int in_h = 32;
    int win_w = 6;
    int win_h = 6;
    int batch = 4;
    int channel = 2;
    int stride_w = 4;
    int stride_h = 4;
    int pad_l = 0;
    int pad_u = 0;
    float minf = 0.0f;
    float maxf = 50.0f;

    /* Ceil-mode output size: ceil((in + 2 * pad - win) / stride) + 1.
     * Using 2 * pad assumes the right/bottom padding equals pad_l / pad_u. */
    int span_w = in_w + pad_l * 2 - win_w;
    int output_w = (span_w + stride_w - 1) / stride_w + 1;
    int span_h = in_h + pad_u * 2 - win_h;
    int output_h = (span_h + stride_h - 1) / stride_h + 1;

    /* 16 slots are reserved but only the first 14 are assigned below;
     * zero-initialize so the unused tail is never indeterminate. */
    long long params[16] = {0};
    params[0] = (long long)in_w;
    params[1] = (long long)in_h;
    params[2] = (long long)win_w;
    params[3] = (long long)win_h;
    params[4] = (long long)output_w;
    params[5] = (long long)output_h;
    params[6] = (long long)batch;
    params[7] = (long long)channel;
    params[8] = (long long)stride_w;
    params[9] = (long long)stride_h;
    params[10] = (long long)pad_l;
    params[11] = (long long)pad_u;
    /* The clamp bounds are passed by address, stored as integers. */
    params[12] = (long long)&minf;
    params[13] = (long long)&maxf;

    /* NOTE(review): 0x0f presumably selects cores 0-3 -- confirm against
     * the platform's core-mask convention. */
    int core_mask = 0x0f;

    fp_maxpool_fusion_s(input_ptr, output_ptr, params, core_mask);
    return 0;
}

私有存储版本:

void hp_maxpool_fusion_p(half *input, half *output, long long *params)

void fp_maxpool_fusion_p(float *input, float *output, long long *params)

void dp_maxpool_fusion_p(double *input, double *output, long long *params)

C调用示例：

//FT78NE示例
#include <stdio.h>

/*
 * Example: call the private-storage fp32 max-pool fusion routine on a
 * batch=4, 32x32, channel=2 input with a 6x6 window, stride 4 and no
 * padding, clamping every result to [0, 50].
 *
 * Returns 0 unconditionally; the routine's result is written to output_ptr.
 */
int main(int argc, char *argv[]) {
    /* Fixed buffer addresses; presumably regions of the target's memory
     * map -- adjust them for the actual board. */
    float *input_ptr = (float *)0xA0000000;
    float *output_ptr = (float *)0xB0000000;

    int in_w = 32;
    int in_h = 32;
    int win_w = 6;
    int win_h = 6;
    int batch = 4;
    int channel = 2;
    int stride_w = 4;
    int stride_h = 4;
    int pad_l = 0;
    int pad_u = 0;
    float minf = 0.0f;
    float maxf = 50.0f;

    /* Ceil-mode output size: ceil((in + 2 * pad - win) / stride) + 1.
     * Using 2 * pad assumes the right/bottom padding equals pad_l / pad_u. */
    int span_w = in_w + pad_l * 2 - win_w;
    int output_w = (span_w + stride_w - 1) / stride_w + 1;
    int span_h = in_h + pad_u * 2 - win_h;
    int output_h = (span_h + stride_h - 1) / stride_h + 1;

    /* 16 slots are reserved but only the first 14 are assigned below;
     * zero-initialize so the unused tail is never indeterminate. */
    long long params[16] = {0};
    params[0] = (long long)in_w;
    params[1] = (long long)in_h;
    params[2] = (long long)win_w;
    params[3] = (long long)win_h;
    params[4] = (long long)output_w;
    params[5] = (long long)output_h;
    params[6] = (long long)batch;
    params[7] = (long long)channel;
    params[8] = (long long)stride_w;
    params[9] = (long long)stride_h;
    params[10] = (long long)pad_l;
    params[11] = (long long)pad_u;
    /* The clamp bounds are passed by address, stored as integers. */
    params[12] = (long long)&minf;
    params[13] = (long long)&maxf;

    fp_maxpool_fusion_p(input_ptr, output_ptr, params);
    return 0;
}